1 - Wrangle Bikes

2. [1 point] Unzip the file and then import the day.csv file and give the data frame the name bikes.

library(tidyverse)
library(readr)
library(modelr)
day <- read_csv("./day.csv")

3. [3 points] The temperature, “feels like” temperature, humidity, and windspeed were all normalized (so that they were between the values of 0 and 1). You are to create new versions these variables so they correspond to their original recorded values by multiplying the existing values by their maximum values described on the UCI Repository. Please call these variables temp_orig, feel_temp_orig, hum_orig, windspeed_orig.

day <- (day %>%
          mutate(temp_orig = temp * 39,
                 feel_temp_orig = atemp * 50,
                 hum_orig = hum * 100,
                 windspeed_org = windspeed * 67))
head(day)
## # A tibble: 6 × 20
##   instant dteday     season    yr  mnth holiday weekday workingday weathersit
##     <dbl> <date>      <dbl> <dbl> <dbl>   <dbl>   <dbl>      <dbl>      <dbl>
## 1       1 2011-01-01      1     0     1       0       6          0          2
## 2       2 2011-01-02      1     0     1       0       0          0          2
## 3       3 2011-01-03      1     0     1       0       1          1          1
## 4       4 2011-01-04      1     0     1       0       2          1          1
## 5       5 2011-01-05      1     0     1       0       3          1          1
## 6       6 2011-01-06      1     0     1       0       4          1          1
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## #   windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## #   temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>

4. Categorical variables to factors. The categorical variables in this data have been coded with integer values and are read as integer values by R. You’ll convert them to non-numeric values as factors (categorical variables that have an associated ordering) with the following:

a) [3 points] Convert the weathersit variable to be a factor with the values “clear”, “mist”, “light precip”, and “heavy precip”, based on the variable definitions. This factor should have the ordering “clear”, “mist”, “light precip”, “heavy precip”.

day$weathersit <- factor(day$weathersit, 
                         levels = c(1, 2, 3, 4), 
                         labels = c('clear', 'mist', 'light precip', 'heavy precip'))
head(day)
## # A tibble: 6 × 20
##   instant dteday     season    yr  mnth holiday weekday workingday weathersit
##     <dbl> <date>      <dbl> <dbl> <dbl>   <dbl>   <dbl>      <dbl> <fct>     
## 1       1 2011-01-01      1     0     1       0       6          0 mist      
## 2       2 2011-01-02      1     0     1       0       0          0 mist      
## 3       3 2011-01-03      1     0     1       0       1          1 clear     
## 4       4 2011-01-04      1     0     1       0       2          1 clear     
## 5       5 2011-01-05      1     0     1       0       3          1 clear     
## 6       6 2011-01-06      1     0     1       0       4          1 clear     
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## #   windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## #   temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>

b) [2 points] Convert the workingday variable to be a factor with the values “work day” and “weekend/holiday”, and the levels “work day”, “weekend/holiday”.

day$workingday <- factor(day$workingday, 
                         levels = c(1, 0), 
                         labels = c('work day', 'weekend/holiday'))
head(day)
## # A tibble: 6 × 20
##   instant dteday     season    yr  mnth holiday weekday workingday    weathersit
##     <dbl> <date>      <dbl> <dbl> <dbl>   <dbl>   <dbl> <fct>         <fct>     
## 1       1 2011-01-01      1     0     1       0       6 weekend/holi… mist      
## 2       2 2011-01-02      1     0     1       0       0 weekend/holi… mist      
## 3       3 2011-01-03      1     0     1       0       1 work day      clear     
## 4       4 2011-01-04      1     0     1       0       2 work day      clear     
## 5       5 2011-01-05      1     0     1       0       3 work day      clear     
## 6       6 2011-01-06      1     0     1       0       4 work day      clear     
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## #   windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## #   temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>

c) [2 points] Convert the yr variable to be a factor with the values 2011 and 2012, and the levels 2011, 2012.

day$yr <- factor(day$yr,
                 levels = 0:1,
                 labels = c('2011', '2012'))
head(day)
## # A tibble: 6 × 20
##   instant dteday     season yr     mnth holiday weekday workingday    weathersit
##     <dbl> <date>      <dbl> <fct> <dbl>   <dbl>   <dbl> <fct>         <fct>     
## 1       1 2011-01-01      1 2011      1       0       6 weekend/holi… mist      
## 2       2 2011-01-02      1 2011      1       0       0 weekend/holi… mist      
## 3       3 2011-01-03      1 2011      1       0       1 work day      clear     
## 4       4 2011-01-04      1 2011      1       0       2 work day      clear     
## 5       5 2011-01-05      1 2011      1       0       3 work day      clear     
## 6       6 2011-01-06      1 2011      1       0       4 work day      clear     
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## #   windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## #   temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>

5. Working with time. Although we have the variable dteday which holds the date, it’s considered a character string rather than a date.

a) [3 points] Convert the dteday to be recognized as dates by R.

# dteday is already recognized as a date.
head(day)
## # A tibble: 6 × 20
##   instant dteday     season yr     mnth holiday weekday workingday    weathersit
##     <dbl> <date>      <dbl> <fct> <dbl>   <dbl>   <dbl> <fct>         <fct>     
## 1       1 2011-01-01      1 2011      1       0       6 weekend/holi… mist      
## 2       2 2011-01-02      1 2011      1       0       0 weekend/holi… mist      
## 3       3 2011-01-03      1 2011      1       0       1 work day      clear     
## 4       4 2011-01-04      1 2011      1       0       2 work day      clear     
## 5       5 2011-01-05      1 2011      1       0       3 work day      clear     
## 6       6 2011-01-06      1 2011      1       0       4 work day      clear     
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## #   windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## #   temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>

b) [3 points] Use the values of dteday to create a new variable, days, that is the number of days since January 1, 2011. Note that you can subtract two dates. Keep in mind that the value stored in days should be stored as a number.

day <- (day %>%
           mutate(days = as.numeric(day$dteday - as.Date("11-01-01", "%y-%m-%d"))))
head(day)
## # A tibble: 6 × 21
##   instant dteday     season yr     mnth holiday weekday workingday    weathersit
##     <dbl> <date>      <dbl> <fct> <dbl>   <dbl>   <dbl> <fct>         <fct>     
## 1       1 2011-01-01      1 2011      1       0       6 weekend/holi… mist      
## 2       2 2011-01-02      1 2011      1       0       0 weekend/holi… mist      
## 3       3 2011-01-03      1 2011      1       0       1 work day      clear     
## 4       4 2011-01-04      1 2011      1       0       2 work day      clear     
## 5       5 2011-01-05      1 2011      1       0       3 work day      clear     
## 6       6 2011-01-06      1 2011      1       0       4 work day      clear     
## # … with 12 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## #   windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## #   temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>,
## #   days <dbl>

2 - Models Relating Bike Rentals and Temperature

Now you’ll investigate different models predicting relationships between the number of rentals (cnt) and the temperature (temp_orig).

1) [2 points] Recreate the following image using your revised bikes data. Be careful to recreate the title, subtitle, and axes labels too (look up labs).

ggplot(day) +
  geom_point(aes(x = temp_orig, y = cnt, col = temp)) +
  labs(x = 'Bike rentals',
       y = 'Temperature in Celcius', 
       title = 'Bike Rentals in DC in 2011 and 2012',
       subtitle = 'Warmer weather leads to more rentals')

2) Linear versus Quadratic Models

a) [1 point] Make a new data frame called bt (for bike temp) that contains the temp_orig and cnt variables from your modified bikes data frame.

bt <- data.frame(day['temp_orig'], day['cnt'])
head(bt)
##   temp_orig  cnt
## 1 13.422513  985
## 2 14.175642  801
## 3  7.658196 1349
## 4  7.800000 1562
## 5  8.851323 1600
## 6  7.969572 1606

b) [3 points] Make two models, one linear and one quadratic, where the number of rentals, cnt, is the dependent (response) variable and the original temperature in Celsius, temp_orig, is the independent (predictor) variable. Save these with different names.

# Linear
line_day <- lm(cnt ~ temp_orig, data = bt)
line_coe <- coefficients(line_day)

# Quadratic
bt2 <- (bt %>%
          mutate(tsqrt = temp_orig ** 2))
quad_day <- model_matrix(cnt ~ temp_orig + tsqrt, data = bt2)
pred <- lm(cnt ~ temp_orig + tsqrt, data = bt2)
quad_day <- quad_day %>% 
              add_predictions(pred)

c) [5 points] Make a grid data frame with values of temp_orig (cut into 20 evenly spaced values) and separately add the predictions from both models to this data frame. NOTE: Do not use gather_predictions here, as Dr. McNelis wants the model predictions to be in two different columns. The predictions from the linear model should be labeled “linpred” and those from the quadratic model labelled as “quadpred”. Note, the add_predictions function allows you to specify the name of variable containing the predictions.

temp_grid <- (bt %>% 
                data_grid(temp_orig = seq_range(temp_orig, 20)))

temp_grid <- (temp_grid %>%
                add_predictions(line_day, var = 'linpred'))

temp_grid <- (temp_grid %>%
                mutate(tsqrt = temp_orig ^ 2) %>%
                add_predictions(pred, var = 'quadpred') %>%
                select(-tsqrt))
temp_grid
## # A tibble: 20 × 3
##    temp_orig linpred quadpred
##        <dbl>   <dbl>    <dbl>
##  1      2.31   1607.    -689.
##  2      3.95   1888.     113.
##  3      5.60   2168.     862.
##  4      7.25   2449.    1556.
##  5      8.90   2729.    2197.
##  6     10.5    3010.    2785.
##  7     12.2    3290.    3318.
##  8     13.8    3571.    3798.
##  9     15.5    3851.    4224.
## 10     17.1    4132.    4597.
## 11     18.8    4412.    4915.
## 12     20.4    4693.    5180.
## 13     22.1    4973.    5391.
## 14     23.7    5254.    5549.
## 15     25.4    5534.    5653.
## 16     27.0    5815.    5703.
## 17     28.7    6095.    5699.
## 18     30.3    6376.    5642.
## 19     32.0    6656.    5531.
## 20     33.6    6937.    5366.

d) [4 points] Make a plot that includes a scatter plot (as above) of temp_orig versus cnt and add curves for your linear and quadratic models from your grid data frame. Have a nice title, subtitle, and axes labels on your graph.

ggplot(day, aes(x = temp_orig, y = cnt, col = temp)) +
  geom_point(alpha = .3) +
  geom_line(data = temp_grid, aes(x = temp_orig, y = quadpred), color = "red") +
  geom_line(data = temp_grid, aes(x = temp_orig, y = linpred), color = 'blue') +
  labs(x = 'Temperature in Celcius',
       y = 'Bike rentals', 
       title = "Comparing Models",
       subtitle = "Blue is linear and red is quadratic.") 

e) [5 points] Add the residuals from the two models to your bt data frame, then create a residual plot broken out by model. Consider adding the nice break/horizontal line at 0 to make that value clear. Again, include a nice title, subtitle, and axes labels on your graphs.

bt <- (bt %>% 
         add_residuals(line_day, var = 'line') %>%
         mutate(tsqrt = temp_orig ^ 2) %>%
         add_residuals(pred, var = 'quad') %>%
         select(-tsqrt))
bt_long <- (bt %>%
         pivot_longer(cols = c('line', 'quad'), 
                      names_to = 'model', 
                      values_to = 'resids'))
bt_long
## # A tibble: 1,462 × 4
##    temp_orig   cnt model resids
##        <dbl> <dbl> <chr>  <dbl>
##  1     13.4    985 line  -2515.
##  2     13.4    985 quad  -2697.
##  3     14.2    801 line  -2827.
##  4     14.2    801 quad  -3089.
##  5      7.66  1349 line  -1170.
##  6      7.66  1349 quad   -372.
##  7      7.8   1562 line   -981.
##  8      7.8   1562 quad   -215.
##  9      8.85  1600 line  -1122.
## 10      8.85  1600 quad   -581.
## # … with 1,452 more rows
ggplot(bt_long) +
  geom_point(aes(x = temp_orig, y = resids, col = model))  +
  geom_ref_line(h = 0) +
  facet_grid(~model) +
  labs(x = 'Temperature in Celcius',
       y = 'Residual', 
       title = "Residuals of the linear and quadratic models",
       subtitle = "Blue is linear and red is quadratic.") +
  theme(legend.position = 'none')

f) [5 points] Based on what you can see that your models capture and what they do not capture, explain if one model is better than another or if they are equally good (or bad). Be very thorough in your explanation and make references to your visual representations of the curves and their residuals in your argument.

Overall the quadratic model seems like it represents the data better. The amount of rentals flattens out and decreases a little after 20 degrees, the linear model does not capture that at all. The linear model’s residuals also creates a pretty defined quadratic curve, whereas the quadratic model’s residuals are distributed better.

3 - Models Relating Bike Rentals and Time

Now you’ll investigate different models predicting relationships between the number of rentals (cnt) and time, in terms of the number of days since January 1, 2011 (days).

3.1 - Polynomial Models

1) [2 points] Recreate the following image using your revised bikes data. Be careful to recreate the title, subtitle, and axes labels too (look up labs).

ggplot(day) +
  geom_point(aes(x = days, y = cnt, col = temp_orig)) +
  labs(x = 'Days since January 1, 2011',
       y = 'Bike rentrals',
       title = 'Bike Rentals in DC, 2011 and 2012',
       subtitle = 'Rentals trends over time')

2) [1 point] Make a new data frame called bd (for bike days) that contains the days and cnt variables from your modified bikes data frame.

bd <- data.frame(day['days'], day['cnt'])

3) [7 points] Make a function called fit_order_n that takes an input of an integer n and returns a scatter plot of our data (like that in problem 1) with our nth order polynomial plotted along with the data. Make sure that the graph includes a TITLE that indicates the order of the polynomial (hint: check out help on the paste function).

fit_order_n <- function(n) {
  fit <- lm(cnt ~ poly(days, degree = n), data = bd)
  p_grid <- data_grid(bd, days)
  p_grid <- (bd %>% 
               gather_predictions(fit))
  
  ggplot(p_grid) +
    geom_point(aes(x = days, y = cnt), alpha = 0.2) + 
    geom_line(aes(x = days, y = pred), col = 'red')  +
    labs(x = 'Days since January 1, 2011',
         y = 'Bike rentrals',
         title = 'Estimating Bike Rentals in DC',
         subtitle = paste('using ', n, 
                          ifelse(n == 1, 'st', 
                                 ifelse(n == 2, 'nd', 
                                        ifelse(n == 3, 'rd', 'th'))), 
                          ' order polynomial', sep = ''))
}

4) [5 points] Use your function to help you determine the smallest value of n that seems to provide a good “fit” to our data. In your comments or in your markdown portion, be sure to add your comment about the best value of n you found.

# I couldn't get a for loop to work
fit_order_n(1)

fit_order_n(2)

fit_order_n(3)

fit_order_n(4)

fit_order_n(5)

fit_order_n(6)

fit_order_n(7)

fit_order_n(8)

fit_order_n(9)

fit_order_n(10)

Using a 4th order polynomial seems to be the minimum that has the same shape as the data data, but using a 6th order polynomial seems to be a substantial improvement over 4th and 5th, and then more or less equal to higher order polynomials.

3.2 - Sinusoidal Models

1)

B = 2 * pi / 365.25
mod3 <- lm(cnt ~ sin(B * days) + cos(B * days), data = day)

2) [2 points] Use the coefficients from the model above to determine the values of the phase shift φ, the amplitude A, and the midline, k. Indicate how you found these. Make sure you use the most precise value of the model coefficients that R stores.

coe <- coefficients(mod3)

phi <- atan(coe[[3]] / coe[[2]])
A <- coe[[2]] / cos(phi)
k = coe[[1]]

3) [4 points] Make a grid data frame with values of days, then use mutate to add a new column to grid called “sinepred” whose values are the values of A sin(B ∗ days + φ) + k.

sine_grid <- (day %>% 
                data_grid(days) %>%
                mutate(sinepred = A * sin(B * days + phi) + k))

4) [4 points] Make a plot that includes a plot of the scatter plot of our bike rentals over time and add the curve for our sinusoidal model predictions (in grid) to this plot. Have a nice title, subtitle, and axes labels on your graph.

ggplot(day) +
  geom_point(aes(x = days, y = cnt, col = temp_orig), alpha = 0.3) +
  geom_line(data = sine_grid, aes(x = days, y = sinepred), col = 'red') +
  labs(x = 'Days since January 1, 2011',
       y = 'Bike rentrals',
       title = 'Estimating Bike Rentals in DC',
       subtitle = 'Using a sinusoidal model')

5) [3 points] Make a copy of your bd data frame called bd2 and add the residuals associated with sinusoidal model as a new variable in this data frame.

bd2 <- (bd %>%
          gather_residuals(mod3))

6) [4 points] Create a residual plot associated with this sinusoidal model. Again, include a nice title, subtitle, and axes labels on your graphs.

ggplot(bd2) +
  geom_point(aes(x = days, y = resid), alpha = 0.5)  +
  geom_ref_line(h = 0) +
  labs(x = 'Days since January 01, 2011',
       y = 'Residual', 
       title = "Residuals of the update models predicting based off days") +
  theme(legend.position = 'none')

4 - Other Key Factors

I think the next best predictors to use could be season, mnth, dteday, and weathersit. I made a couple tibbles below to show the average for each of them. season would not be the best predictor, especially since mnth holds practically the same information and is more specific. dteday and mnth would be likely be similar to the model we made with days, so making a model using them as a predictor could be redundant.

I originally thought workingday would be a good predictor. I was expecting none working days to have a lower average, but working days actually end up having about 200 more bike rentals per day. In retrospect that makes sense, as people use them to commute. Either way, I don’t think there is a significant enough difference to really be a good predictor.

day
## # A tibble: 731 × 21
##    instant dteday     season yr     mnth holiday weekday workingday   weathersit
##      <dbl> <date>      <dbl> <fct> <dbl>   <dbl>   <dbl> <fct>        <fct>     
##  1       1 2011-01-01      1 2011      1       0       6 weekend/hol… mist      
##  2       2 2011-01-02      1 2011      1       0       0 weekend/hol… mist      
##  3       3 2011-01-03      1 2011      1       0       1 work day     clear     
##  4       4 2011-01-04      1 2011      1       0       2 work day     clear     
##  5       5 2011-01-05      1 2011      1       0       3 work day     clear     
##  6       6 2011-01-06      1 2011      1       0       4 work day     clear     
##  7       7 2011-01-07      1 2011      1       0       5 work day     mist      
##  8       8 2011-01-08      1 2011      1       0       6 weekend/hol… mist      
##  9       9 2011-01-09      1 2011      1       0       0 weekend/hol… clear     
## 10      10 2011-01-10      1 2011      1       0       1 work day     clear     
## # … with 721 more rows, and 12 more variables: temp <dbl>, atemp <dbl>,
## #   hum <dbl>, windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## #   temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>,
## #   days <dbl>
ggplot(day) +
  geom_point(aes(x = temp_orig, y = cnt, col = weathersit))

str(day)
## spec_tbl_df [731 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ instant       : num [1:731] 1 2 3 4 5 6 7 8 9 10 ...
##  $ dteday        : Date[1:731], format: "2011-01-01" "2011-01-02" ...
##  $ season        : num [1:731] 1 1 1 1 1 1 1 1 1 1 ...
##  $ yr            : Factor w/ 2 levels "2011","2012": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mnth          : num [1:731] 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday       : num [1:731] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday       : num [1:731] 6 0 1 2 3 4 5 6 0 1 ...
##  $ workingday    : Factor w/ 2 levels "work day","weekend/holiday": 2 2 1 1 1 1 1 2 2 1 ...
##  $ weathersit    : Factor w/ 4 levels "clear","mist",..: 2 2 1 1 1 1 2 2 1 1 ...
##  $ temp          : num [1:731] 0.344 0.363 0.196 0.2 0.227 ...
##  $ atemp         : num [1:731] 0.364 0.354 0.189 0.212 0.229 ...
##  $ hum           : num [1:731] 0.806 0.696 0.437 0.59 0.437 ...
##  $ windspeed     : num [1:731] 0.16 0.249 0.248 0.16 0.187 ...
##  $ casual        : num [1:731] 331 131 120 108 82 88 148 68 54 41 ...
##  $ registered    : num [1:731] 654 670 1229 1454 1518 ...
##  $ cnt           : num [1:731] 985 801 1349 1562 1600 ...
##  $ temp_orig     : num [1:731] 13.42 14.18 7.66 7.8 8.85 ...
##  $ feel_temp_orig: num [1:731] 18.18 17.69 9.47 10.61 11.46 ...
##  $ hum_orig      : num [1:731] 80.6 69.6 43.7 59 43.7 ...
##  $ windspeed_org : num [1:731] 10.7 16.7 16.6 10.7 12.5 ...
##  $ days          : num [1:731] 0 1 2 3 4 5 6 7 8 9 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   instant = col_double(),
##   ..   dteday = col_date(format = ""),
##   ..   season = col_double(),
##   ..   yr = col_double(),
##   ..   mnth = col_double(),
##   ..   holiday = col_double(),
##   ..   weekday = col_double(),
##   ..   workingday = col_double(),
##   ..   weathersit = col_double(),
##   ..   temp = col_double(),
##   ..   atemp = col_double(),
##   ..   hum = col_double(),
##   ..   windspeed = col_double(),
##   ..   casual = col_double(),
##   ..   registered = col_double(),
##   ..   cnt = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
day %>%
  group_by(mnth) %>%
  summarise(avg = mean(cnt))
## # A tibble: 12 × 2
##     mnth   avg
##    <dbl> <dbl>
##  1     1 2176.
##  2     2 2655.
##  3     3 3692.
##  4     4 4485.
##  5     5 5350.
##  6     6 5772.
##  7     7 5564.
##  8     8 5664.
##  9     9 5767.
## 10    10 5199.
## 11    11 4247.
## 12    12 3404.
day %>%
  group_by(season) %>%
  summarise(avg = mean(cnt))
## # A tibble: 4 × 2
##   season   avg
##    <dbl> <dbl>
## 1      1 2604.
## 2      2 4992.
## 3      3 5644.
## 4      4 4728.
day %>%
  group_by(weathersit) %>%
  summarise(avg = mean(cnt))
## # A tibble: 3 × 2
##   weathersit     avg
##   <fct>        <dbl>
## 1 clear        4877.
## 2 mist         4036.
## 3 light precip 1803.
day %>%
  group_by(workingday) %>%
  summarise(avg = mean(cnt))
## # A tibble: 2 × 2
##   workingday        avg
##   <fct>           <dbl>
## 1 work day        4585.
## 2 weekend/holiday 4330.